# Load packages
library(here)
library(dplyr)
library(readr)
library(rstan)
library(bayesrules)
library(tidyverse)
library(bayesplot)
library(rstanarm)
library(janitor)
library(tidybayes)
library(broom.mixed)
library(here)
library(sf)
library(tidycensus)
library(openxlsx)
library(s2)
# themes
theme_set(theme_minimal())
vari_names <- read_csv(here("clean_data", "nyc_names.csv"))
nyc_clean <- st_read(here("clean_data", "nyc_data.shp"), crs = 4269, quiet=T) 
colnames(nyc_clean) <- colnames(vari_names)


library(openxlsx)
nta_to_census <- openxlsx::read.xlsx(here("ethnic", "Data", "census_to_nta.xlsx")) %>%
  dplyr::select(BoroName, NTACode) %>%
  rename(borough = BoroName,
         nta_id = NTACode) %>%
  unique()

nyc_clean <- nyc_clean %>%
  merge(., nta_to_census, by="nta_id") %>%
  mutate(transportation_desert_4cat = factor(transportation_desert_4cat, levels=c("No Access", "Limited Access", "Satisfactory Access", "Excellent Access")))
subway_stations <- st_read(here("ethnic","Data","stations", "geo_export_85568705-efba-4456-bdc0-3d70ff2cf8e5.shp"), quiet=T)  %>%
  st_transform(., 4269)

bus_stations <- st_read(here("ethnic","Data","bus", "bus_stops_nyc_may2020.shp"), quiet=T)  %>%
  st_transform(., 4269)

transit_points <- read_csv(here("transit","ridership_points.csv"))%>%
  separate(Position, into=c("Point", "longitude", "latitude"), " ") %>%
  mutate(latitude = str_remove_all(latitude, "[)]"),
         longitude = str_remove_all(longitude, "[()]"),
         ) %>%
  dplyr::select(-c(Point)) %>% 
  mutate(latitude = as.numeric(latitude),
         longitude = as.numeric(longitude)) %>%
  st_as_sf(coords = c("longitude", "latitude"), crs = 4269)
#plot locations over map
subway_loc <- ggplot() +
  geom_sf(data = nyc_clean, fill = "#EBF6FF", color = "#D48DD8", size = 0.15, alpha = .8) +
  geom_sf(data = subway_stations, color="#3F123C", size=1) + 
  coord_sf(datum = st_crs(subway_stations)) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Subway Stop Locations \nin NYC")+ 
    theme(#panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 30, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

bus_loc <- ggplot() +
  geom_sf(data = nyc_clean, fill = "#EBF6FF", color = "#D48DD8", size = 0.15, alpha = .8) +
  geom_sf(data = bus_stations, color="#3F123C", size=.5, alpha=.5) + 
  coord_sf(datum = st_crs(subway_stations)) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Bus Stop Locations \nin NYC")+ 
    theme(#panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 30, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))


stops <- nyc_clean %>%
  ggplot() +
  geom_sf(aes(fill = sub_count), color = "#8f98aa") +
  scale_fill_gradient(low= "lavender", high = "maroon",
                      guide = guide_legend(title = "Number of Subway Stops") ,na.value="#D6D6D6") +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Subway Stop Counts \nin NYC")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 30, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

bus_stops <- nyc_clean %>%
  ggplot() +
  geom_sf(aes(fill = bus_count), color = "#8f98aa") +
  scale_fill_gradient(low= "lavender", high = "maroon",
                      guide = guide_legend(title = "Number of Bus Stops") ,na.value="#D6D6D6") +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Bus Stop Counts \nin NYC")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 30, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))


ridership <- nyc_clean%>%
  ggplot() +
  geom_sf(aes(fill = log2(mean_ridership)), color = "#8f98aa") +
  scale_fill_gradient(low= "lavender", high = "maroon",
                      guide = guide_legend(title = "Log2 Mean Ridership") ,na.value="#D6D6D6") +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Mean (Log2) Subway Turnstile \nRidership in 2018 \nfor NYC")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 30, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

access <- nyc_clean %>%
  ggplot() +
  geom_sf(aes(fill = transportation_desert_4cat), color = "#8f98aa") +
  scale_fill_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
                       guide = guide_legend(title = "Subway Accessibility Category"), na.value="#D6D6D6") +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Subway Deserts \nin NYC")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 30, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

`

red <- ggplot(nyc_clean) +
  geom_sf(aes(fill = below_poverty_line_count), color = "#8f98aa") +
  scale_fill_gradient(low = "#FCF5EE", high = "#E13728", guide = guide_legend(title = "Number Below \nPoverty Line")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Impoverishement")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

yellow <- ggplot(nyc_clean) +
  geom_sf(aes(fill = mean_income), color = "#8f98aa") +
  scale_fill_gradient(low = "#FCF5EE", high = "#F3D24E", guide = guide_legend(title = "Mean Income")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Mean Income")+
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12),
          legend.text = element_text(size = 12)) +
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

teal <- ggplot(nyc_clean) +
  geom_sf(aes(fill = mean_rent), color = "#8f98aa") +
  scale_fill_gradient(low = "#FCF5EE", high = "#2DBDC7", guide = guide_legend(title = "Dollars")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Mean Rent")+
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12),
          legend.text = element_text(size = 12)) +
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

purple <- ggplot(nyc_clean) +
  geom_sf(aes(fill = eviction_count), color = "#8f98aa")+
  scale_fill_gradient(low = "#FCF5EE", high = "#7826C0", guide = guide_legend(title = "Number of Evictions")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Evictions")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

orange <- ggplot(nyc_clean) +
  geom_sf(aes(fill = unemployment_count), color = "#8f98aa")+
  scale_fill_gradient(low = "#FCF5EE", high = "#FC9228", guide = guide_legend(title = "Number on \nUnemployment")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Unemployment")+
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12),
          legend.text = element_text(size = 12)) +
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

green <- ggplot(nyc_clean) +
  geom_sf(aes(fill = store_count), color = "#8f98aa")+
  scale_fill_gradient(low = "#FCF5EE", high = "#326902", guide = guide_legend(title = "Number of Stores")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Retail Food Stores")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

blue <- ggplot(nyc_clean) +
  geom_sf(aes(fill = school_count), color = "#8f98aa")+
  scale_fill_gradient(low = "#FCF5EE", high = "#5372C4", 
                      guide = guide_legend(title = "Number of Schools")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Number of Schools")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

pink <- ggplot(nyc_clean) +
  geom_sf(aes(fill = uninsured_count), color = "#8f98aa")+
  scale_fill_gradient(low = "#FCF5EE", high = "#F450E1", guide = guide_legend(title = "Number of People \n without Insurance Coverage")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Insurance Coverage")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))
white <- ggplot(nyc_clean) +
  geom_sf(aes(fill = white_count), color = "#8f98aa") +
  scale_fill_gradient(low = "#FCF5EE", high = "#7B435B", guide = guide_legend(title = "Number White")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("White Population")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 24, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

black <- ggplot(nyc_clean) +
  geom_sf(aes(fill = black_count), color = "#8f98aa") +
  scale_fill_gradient(low = "#FCF5EE", high = "#F25F5C", guide = guide_legend(title = "Number Black")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Black Population")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 24, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

asian <- ggplot(nyc_clean) +
  geom_sf(aes(fill = asian_count), color = "#8f98aa") +
  scale_fill_gradient(low = "#FCF5EE", high = "#717EC3", guide = guide_legend(title = "Number Asian")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Asian Population")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 24, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

latinx <- ggplot(nyc_clean) +
  geom_sf(aes(fill = latinx_count), color = "#8f98aa")+
  scale_fill_gradient(low = "#FCF5EE", high = "#FC9A38", guide = guide_legend(title = "Number Latinx")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Latinx Population")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 24, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

pop <- ggplot(nyc_clean) +
  geom_sf(aes(fill = total_pop), color = "#8f98aa")+
  scale_fill_gradient(low = "#FCF5EE", high = "#81CF97", guide = guide_legend(title = "Number of People")) +
  theme_minimal() +
  theme(panel.grid.major = element_line("transparent"),
        axis.text = element_blank()) +
  ggtitle("Total Population")+ 
    theme(panel.grid.major = element_line("transparent"),
          plot.title = element_text(size = 26, face = "bold"),
          legend.title = element_text(size = 12), 
          legend.text = element_text(size = 12)) + 
    guides(shape = guide_legend(override.aes = list(size = 8)),
           color = guide_legend(override.aes = list(size = 8)))

Data Introduction

All the data used in this project are from two major sources: the Tidycensus package and NYC Open Data.

Tidycensus is an R package interface, developed by Kyle Walker and Matt Herman, that enables easy access to the US Census Bureau’s data APIs and returns Tidyverse-ready data frames from various major US Census Bureau datasets. Our demographic and socioeconomic data are drawn from the American Community Survey results found in Tidycensus package. A summary of our ACS data variables is below:

  • borough: Each Neighborhood’s Borough.
  • total_pop: Total Population by Neighborhood
  • mean_income: Mean Income by Neighborhood
  • below_poverty_line_count: Number of People Living Below the 100% Poverty Line by Neighborhood
  • mean_rent: Mean Rent by Neighborhood
  • unemployment_count: Number of People on Unemployment by Neighborhood
  • latinx_count: Number of Latinx People by Neighborhood
  • white_count: Number of White People by Neighborhood
  • black_count: Number of Black People by Neighborhood
  • native_count: Number of Native People by Neighborhood
  • asian_count: Number of Asian People by Neighborhood
  • naturalized_citizen_count: Number of Naturalized Citizens by Neighborhood
  • noncitizen_count: Number of Foreign Born People by Neighborhood
  • uninsured_count: Number of Uninsured Citizens of any Age by Neighborhood

For remaining predictors, we used NYC Open Data’s portal to identify specific predictors. In particular, we used geotagged locations of Subway Stops, Bus Stops, Grocery Stores, Schools, and Eviction Sites from the Departments of Transportation, Health, Education, and Housing to calculate neighborhood-specific variables described below:

  • school_count: Number of Public Schools by Neighborhood
  • eviction_count: Number of Evictions by Neighborhood
  • store_count: Number of Grocery Stores and Food Vendors by Neighborhood
  • sub_count: Number of Subway Stations by Neighborhood
  • bus_count: Number of Bus Stations by Neighborhood
  • perc_covered_by_transit: Percent of Neighborhood Within Walking Distance (.25 miles) of Any Subway Stop.
  • transportation_desert_4cat: Subway Accessibility by Neighborhood (None, Limited, Satisfactory, Excellent)

Lastly, we acquired subway ridership from Metropolitan Transportation Authority’s turnstile data for the week of September 7, 2019. For each station, entry/exit of each turnstile is recorded. Then, we aggregated this information by taking the station-specific average of subway ridership across the 7 days in the week. Finally, we geotagged each listed station, then took the mean of ridership at all subway stations in each neighborhood to create.

  • mean_ridership: Mean Subway Ridership by Neighborhood for the week of September 7th.







Data Summaries

Our data has 224 observations of 26 variables. Below is a preview of our dataset with colnames attached.

library(kableExtra)
kable(head(nyc_clean, n=3)) %>% kable_styling() %>% scroll_box()
nta_id total_pop mean_income below_poverty_line_count below_poverty_line_and_50_count mean_rent unemployment_count latinx_count white_count black_count native_count asian_count naturalized_citizen_count noncitizen_count uninsured_count school_count eviction_count store_count sub_count bus_count mean_ridership perc_covered_by_transit transportation_desert_3cat transportation_desert_4cat borough geometry
BK0101 26308 98338.67 2397 1289 2062.667 582 3284 20526 482 40 1052 3777 3129 1797 6 68 71 2 53 9410.500 28.36395 Satisfactory Access Limited Access Brooklyn MULTIPOLYGON (((-73.94074 4…
BK0102 57774 101238.92 9120 4474 2330.077 1710 18227 32237 1460 0 4008 6802 7746 3725 12 204 129 2 97 26603.000 55.26492 Satisfactory Access Satisfactory Access Brooklyn MULTIPOLYGON (((-73.96355 4…
BK0103 36891 30309.25 18285 5970 1194.875 457 3351 31799 1288 20 194 3548 1012 711 6 45 58 3 35 6348.667 87.71022 Satisfactory Access Satisfactory Access Brooklyn MULTIPOLYGON (((-73.96762 4…

Below is a numeric summary of each variable’s distribution.

#summary(nyc_clean)
library(table1)
table_print <- table1(~ total_pop + mean_income + below_poverty_line_count+ 
         mean_rent + unemployment_count + white_count + uninsured_count + school_count + eviction_count + store_count + transportation_desert_4cat+ sub_count + bus_count + mean_ridership | borough, data = nyc_clean %>% as.tibble())  %>% as.tibble() 

colnames(table_print) <- c("Variable", "Bronx (N=44)", "Brooklyn (N=64)","Manhattan (N=39)", "Queens (N=77)", "Overall (N=224)")

table_print%>%
  filter(Variable!="") %>% kable() %>% kable_styling() %>% scroll_box()
Variable Bronx (N=44) Brooklyn (N=64) Manhattan (N=39) Queens (N=77) Overall (N=224)
total_pop
  Mean (SD) 30200 (18700) 37800 (24600) 38300 (24600) 25900 (20900) 32300 (22800)
  Median [Min, Max] 29800 [0, 69200] 38300 [0, 97800] 35700 [0, 95300] 25000 [0, 87700] 31600 [0, 97800]
mean_income
  Mean (SD) 43400 (17900) 69600 (28700) 103000 (49700) 74500 (14400) 71900 (33900)
  Median [Min, Max] 38000 [23100, 94200] 61200 [27400, 148000] 108000 [33300, 212000] 72600 [37500, 104000] 67200 [23100, 212000]
  Missing 8 (18.2%) 9 (14.1%) 6 (15.4%) 19 (24.7%) 42 (18.8%)
below_poverty_line_count
  Mean (SD) 8360 (6490) 7430 (6120) 6180 (6100) 3090 (2900) 5900 (5700)
  Median [Min, Max] 7260 [0, 21600] 6880 [0, 28800] 3290 [0, 22800] 2680 [0, 11600] 4220 [0, 28800]
mean_rent
  Mean (SD) 1230 (197) 1580 (465) 1960 (674) 1650 (198) 1600 (465)
  Median [Min, Max] 1240 [833, 1620] 1450 [792, 3280] 2070 [884, 3270] 1630 [1140, 2250] 1510 [792, 3280]
  Missing 8 (18.2%) 9 (14.1%) 6 (15.4%) 19 (24.7%) 42 (18.8%)
unemployment_count
  Mean (SD) 1400 (972) 1180 (903) 1210 (1100) 756 (685) 1080 (916)
  Median [Min, Max] 1330 [0, 3150] 1120 [0, 3770] 952 [0, 4700] 668 [0, 3150] 919 [0, 4700]
white_count
  Mean (SD) 2830 (4990) 13900 (14200) 17200 (14700) 6740 (8500) 9850 (12300)
  Median [Min, Max] 919 [0, 27500] 10900 [0, 64500] 13800 [0, 69300] 4200 [0, 43900] 4960 [0, 69300]
uninsured_count
  Mean (SD) 2570 (1990) 2750 (2260) 2030 (2150) 2350 (2510) 2450 (2280)
  Median [Min, Max] 2340 [0, 8030] 2610 [0, 10100] 1380 [0, 10300] 1660 [0, 12200] 1910 [0, 12200]
school_count
  Mean (SD) 8.64 (7.44) 8.16 (6.79) 8.54 (7.79) 4.08 (2.88) 6.92 (6.41)
  Median [Min, Max] 5.50 [1.00, 27.0] 6.00 [1.00, 31.0] 5.00 [1.00, 28.0] 3.00 [1.00, 12.0] 5.00 [1.00, 31.0]
eviction_count
  Mean (SD) 438 (340) 245 (234) 223 (233) 126 (131) 238 (255)
  Median [Min, Max] 406 [1.00, 1130] 163 [1.00, 829] 152 [1.00, 1120] 93.0 [1.00, 521] 148 [1.00, 1130]
store_count
  Mean (SD) 53.2 (38.6) 69.8 (49.5) 58.6 (39.8) 33.1 (33.8) 52.0 (43.1)
  Median [Min, Max] 48.5 [1.00, 138] 70.5 [1.00, 202] 54.0 [1.00, 151] 24.0 [1.00, 147] 45.0 [1.00, 202]
transportation_desert_4cat
  No Access 5 (11.4%) 9 (14.1%) 2 (5.1%) 40 (51.9%) 56 (25.0%)
  Limited Access 21 (47.7%) 23 (35.9%) 6 (15.4%) 29 (37.7%) 79 (35.3%)
  Satisfactory Access 14 (31.8%) 24 (37.5%) 13 (33.3%) 8 (10.4%) 59 (26.3%)
  Excellent Access 4 (9.1%) 8 (12.5%) 18 (46.2%) 0 (0%) 30 (13.4%)
sub_count
  Mean (SD) 1.95 (1.33) 2.77 (2.06) 4.03 (3.53) 1.55 (1.22) 2.41 (2.23)
  Median [Min, Max] 1.00 [1.00, 7.00] 2.00 [1.00, 9.00] 3.00 [1.00, 17.0] 1.00 [1.00, 6.00] 1.00 [1.00, 17.0]
bus_count
  Mean (SD) 40.1 (22.9) 60.2 (41.1) 46.6 (26.0) 56.9 (45.0) 52.7 (38.0)
  Median [Min, Max] 43.5 [1.00, 125] 59.0 [1.00, 170] 44.0 [2.00, 106] 52.0 [1.00, 243] 49.0 [1.00, 243]
mean_ridership
  Mean (SD) 7180 (3410) 7400 (4690) 22900 (19500) 10900 (12200) 12000 (13300)
  Median [Min, Max] 6670 [2420, 15700] 6610 [1040, 26600] 18000 [5640, 110000] 7920 [273, 55700] 7960 [273, 110000]
  Missing 21 (47.7%) 18 (28.1%) 7 (17.9%) 54 (70.1%) 100 (44.6%)







Data Visuals

Non-Spatial

library(ggridges)
plot_1<-nyc_clean %>% 
  ggplot(aes(x=mean_income, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="Mean Income", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))

plot_2<-nyc_clean %>% 
  ggplot(aes(x=below_poverty_line_count, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="Number Below Poverty Line", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))

plot_3<-nyc_clean %>% 
  ggplot(aes(x=mean_rent, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="Mean Rent", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))

plot_4<-nyc_clean %>% 
  ggplot(aes(x=unemployment_count, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="Unemployed Counts", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))

plot_5<-nyc_clean %>% 
  ggplot(aes(x=white_count, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="White Counts", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))

plot_6<-nyc_clean %>% 
  ggplot(aes(x=uninsured_count, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="Uninsured Counts", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))


plot_7<-nyc_clean %>% 
  ggplot(aes(x=school_count, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="School Counts", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))

plot_8<-nyc_clean %>% 
  ggplot(aes(x=eviction_count, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="Eviction Counts", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))

plot_9<-nyc_clean %>% 
  ggplot(aes(x=store_count, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="Food Retail Counts", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))

plot_10<-nyc_clean %>% 
  ggplot(aes(x=borough, fill=transportation_desert_4cat), alpha=.6) +
  geom_bar(position="fill") +
  scale_y_continuous(labels = seq(0, 100, by = 25)) +
  labs(title="Subway Accessibility", y="", x="")+
    theme(panel.grid.major = element_line("transparent"),
         # axis.text.y.left = element_blank(),
          axis.text.x.bottom = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold")) +
   scale_fill_manual(values=c("#a45371","#e5b6c7","#ebebf7","#89a2d1"),
                       guide = guide_legend(title = "Subway Accessibility"), na.value="#D6D6D6") 

plot_11<-nyc_clean %>% 
  ggplot(aes(x=sub_count, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="Subway Stop Counts", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))


plot_12<-nyc_clean %>% 
  ggplot(aes(x=bus_count, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="Bus Stop Counts", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))

plot_13<-nyc_clean %>% 
  ggplot(aes(x=mean_ridership, y=borough, fill=borough), alpha=.6) +
  geom_density_ridges() +
  labs(title="Mean Ridership", y="")+
    theme(panel.grid.major = element_line("transparent"),
          axis.text.y.left = element_text(size = 16, face = "bold"),
          plot.title = element_text(size = 28,hjust=.5, face = "bold"),
          legend.position="none") +
  scale_fill_manual(values=c("#e09f3e","#16bac5","#717ec3","#5da271"))
library(egg)
ggarrange(plot_1, plot_2, plot_3, 
          plot_4, plot_5, plot_6, 
          plot_7, plot_8, plot_9, plot_11, plot_12, 
          plot_13, plot_10,
          ncol=4)

Spatial

Ridership

library(egg)
ggarrange(subway_loc, bus_loc, stops, bus_stops, ridership, access, ncol=3)



Structural

ggarrange(red, orange, yellow, green, teal, blue, purple, pink, ncol=3)



Demographic

ggarrange(pop, white, black, latinx, asian, ncol=3)







Model Building

Models 1 & 2: Non-Hierarchical

We use poisson and negative binomial regression to model observed white counts, \(i\), using our \(k=10\) predictors:

  • mean_income
  • mean_rent
  • unemployment_count
  • sub_count
  • transportation_desert_4cat
  • school_count
  • store_count
  • bus_count
  • eviction_count
  • uninsured_count

Poisson

\[\begin{split} \text{White}_{i} \mid \beta_{0}, \beta_1, ..., \beta_k & \sim \text{Pois}(\lambda_{i}) \; \; \; \; \text{where} \log(\lambda_{i}) = \beta_{0} + \sum^{10}_{k=1}X_{ik}\beta_k \\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ \end{split}\]

poisson_non_hierarchical <- stan_glm(
  white_count ~ mean_income + mean_rent + 
    unemployment_count + 
    transportation_desert_4cat + school_count + 
    store_count + bus_count + 
    eviction_count + uninsured_count,
  data = nyc_clean,
  family = poisson,
  chains = 4, iter = 5000*2, seed = 84735, refresh = 0
)
pp_check(poisson_non_hierarchical) + 
  xlab("White Resident Count") +
  labs(title = "Poisson")+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

library(kableExtra)
nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_poisson <-  posterior_predict(
  poisson_non_hierarchical, newdata = nyc_predict_clean)

library(tidybayes)
library(bayesplot)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_poisson,
              prob_outer = 0.95) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
  xaxis_text(angle = 90, hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.y = element_blank(),
        axis.text.y = element_blank())+
  coord_flip() 

tidy(poisson_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.95) %>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Non-Hierarchical Poisson - Model Summary") %>% 
  kable_styling()
Non-Hierarchical Poisson - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 3086.3085579 0.0044196 3059.5058347 3113.1479474
mean_income 0.0010568 0.0000000 0.0010475 0.0010659
mean_rent -0.0256000 0.0000033 -0.0262373 -0.0249529
unemployment_count 0.0332760 0.0000018 0.0329284 0.0336202
transportation_desert_4catLimited Access 92.7764414 0.0026721 91.7703435 93.7970286
transportation_desert_4catSatisfactory Access 68.6455891 0.0030239 67.6372347 69.6760191
transportation_desert_4catExcellent Access 71.6781227 0.0034330 70.5339718 72.8578763
school_count 0.7569529 0.0001606 0.7249689 0.7886982
store_count 1.0934235 0.0000287 1.0877705 1.0990587
bus_count 0.3622793 0.0000246 0.3573912 0.3672348
eviction_count -0.3311558 0.0000065 -0.3324281 -0.3298777
uninsured_count -0.0072184 0.0000005 -0.0073205 -0.0071140

Negative Binomial

\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k, r & \sim \text{NegBin}(\mu_{ij}, r) \; \; \; \; \text{where} \log(\mu_{i}) = \beta_{0} + \sum^{11}_{k=1}X_{ik}\beta_k \\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ r & \sim Exp(1) \\ \end{split}\]

prior_summary(poisson_non_hierarchical)
## Priors for model 'poisson_non_hierarchical' 
## ------
## Intercept (after predictors centered)
##  ~ normal(location = 0, scale = 2.5)
## 
## Coefficients
##   Specified prior:
##     ~ normal(location = [0,0,0,...], scale = [2.5,2.5,2.5,...])
##   Adjusted prior:
##     ~ normal(location = [0,0,0,...], scale = [7.4e-05,5.4e-03,3.0e-03,...])
## ------
## See help('prior_summary.stanreg') for more details
negbin_non_hierarchical <- stan_glm(
  white_count ~  mean_income + mean_rent + 
    unemployment_count  + 
    transportation_desert_4cat + school_count + 
    store_count + bus_count + 
    eviction_count + uninsured_count,
  data = nyc_clean,
  family = neg_binomial_2,
  chains = 4, iter = 5000*2, seed = 84735, refresh = 0
)
pp_check(negbin_non_hierarchical) + 
  xlab("White Resident Count") +
  xlim(0,100000)+
  labs(title = "Negative Binomial")+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_negbin <-  posterior_predict(
  negbin_non_hierarchical, newdata = nyc_predict_clean)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_negbin,
              prob_outer = 0.95) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
    xaxis_text(angle = 90,  hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.y = element_blank(),
        axis.text.y = element_blank())+
  coord_flip()

tidy(negbin_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.95)%>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Non-Hierarchichal Negative Binomial - Model Summary") %>% 
  kable_styling()
Non-Hierarchichal Negative Binomial - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 1672.0665352 0.4722233 656.7854013 4238.0843477
mean_income 0.0015311 0.0000065 0.0002738 0.0028136
unemployment_count 0.0445480 0.0001868 0.0076146 0.0830073
transportation_desert_4catLimited Access 116.5599826 0.2303102 36.3798608 239.6150291
transportation_desert_4catSatisfactory Access 122.9881362 0.2645798 32.9433959 276.5013767
transportation_desert_4catExcellent Access 90.3849788 0.3074693 4.3775862 251.4537682
store_count 0.8395041 0.0031393 0.2267540 1.4595117
bus_count 0.6231777 0.0030818 0.0245491 1.2541492
eviction_count -0.3295643 0.0004917 -0.4263220 -0.2294745
table1 <- prediction_summary(model=poisson_non_hierarchical, data=nyc_clean %>% na.omit())%>% 
  mutate(Model = "Poisson")
table2 <- prediction_summary(model=negbin_non_hierarchical, data=nyc_clean %>% na.omit())%>% 
  mutate(Model = "Negative Binomial")

Negative binomial has much better error metrics.

Models 2 & 3: Hierarchy by Borough

We use poisson and negative binomial hierarchical regression to model observed white counts, \(i\), by boroughs \(j\), usin our \(k=11\) predictors. Note we are creating 3 dummy variables associated with transportation_desert_4cat:

  • mean_income
  • mean_rent
  • unemployment_count
  • transportation_desert_4cat
  • school_count
  • store_count
  • bus_count
  • eviction_count
  • uninsured_count

Poisson

\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k & \sim \text{Pois}(\lambda_{ij}) \\ & \text{where} \log(\lambda_{ij}) = \beta_{0j} + \sum^{11}_{k=1}X_{ijk}\beta_k \\ \beta_{0j} \mid \beta_0, \sigma_0 & \stackrel{ind}{\sim} N(\beta_0, \sigma_0^2)\\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ \sigma_0 & \sim Exp(1) \end{split}\]

poisson_hierarchical <- stan_glmer(
 white_count ~ mean_income + mean_rent + 
    unemployment_count  + 
    transportation_desert_4cat + school_count + 
    store_count + bus_count + 
    eviction_count + uninsured_count + (1 | borough),
  data = nyc_clean,
  family = poisson,
  chains = 4, iter = 5000*2, seed = 84735, refresh = 0
)
pp_check(poisson_hierarchical) + 
  xlab("White Resident Count") +
  labs(title = "Poisson")+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

library(kableExtra)

nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_poisson <-  posterior_predict(
  poisson_hierarchical, newdata = nyc_predict_clean)

library(tidybayes)
library(bayesplot)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_poisson,
              prob_outer = 0.95) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
  xaxis_text(angle = 90, hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.y = element_blank(),
        axis.text.y = element_blank())+
  coord_flip() 

tidy(poisson_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.95) %>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Hierarchicahl Poisson - Model Summary") %>% 
  kable_styling()
Hierarchicahl Poisson - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 3641.6183605 0.2336629 1525.6615533 5846.1546479
mean_income 0.0008367 0.0000000 0.0008273 0.0008460
mean_rent -0.0257308 0.0000032 -0.0263634 -0.0251027
unemployment_count 0.0311196 0.0000018 0.0307712 0.0314614
transportation_desert_4catLimited Access 83.3231548 0.0028270 82.3342797 84.3406861
transportation_desert_4catSatisfactory Access 45.3638879 0.0032106 44.4562937 46.3000854
transportation_desert_4catExcellent Access 37.1632607 0.0036553 36.2030659 38.1496957
school_count 0.5155870 0.0001610 0.4841780 0.5475625
store_count 0.8827876 0.0000312 0.8767807 0.8889429
bus_count 0.5060502 0.0000264 0.5007953 0.5112620
eviction_count -0.3241652 0.0000071 -0.3255300 -0.3227897
uninsured_count -0.0052898 0.0000006 -0.0053997 -0.0051788

Negative Binomial

\[\begin{split} \text{White}_{ij} \mid \beta_{0j}, \beta_1, ..., \beta_k, r & \sim \text{NegBin}(\mu_{ij}, r) \\ & \text{where} \log(\mu_{ij}) = \beta_{0j} + \sum^{11}_{k=1}X_{ijk}\beta_k \\ \beta_{0j} \mid \beta_0, \sigma_0 & \stackrel{ind}{\sim} N(\beta_0, \sigma_0^2)\\ \beta_{0c} & \sim N(0,2.5^2)\\ \beta_1 &\sim N(0, 0.0000738^2)\\ \beta_2 &\sim N(0, 0.00537^2)\\ \beta_3 &\sim N(0, 0.00298^2)\\ \beta_4 &\sim N(0, 5.281^2)\\ \beta_5 &\sim N(0, 5.375^2)\\ \beta_6 &\sim N(0, 6.7192^2)\\ \beta_7 &\sim N(0, 0.390^2)\\ \beta_8 &\sim N(0, 0.0633^2)\\ \beta_9 &\sim N(0, 0.0714^2)\\ \beta_{10} &\sim N(0, .00985^2)\\ \beta_{11} &\sim N(0, .00115^2)\\ r & \sim Exp(1) \\ \sigma_0 & \sim Exp(1) \end{split}\]

negbin_hierarchical <- stan_glmer(
 white_count ~ mean_income + mean_rent + 
    unemployment_count + 
    transportation_desert_4cat + school_count + 
    store_count + bus_count + 
    eviction_count + uninsured_count + (1 | borough),
  data = nyc_clean,
  family = neg_binomial_2,
  chains = 4, iter = 5000*2, seed = 84735, refresh = 0)
pp_check(negbin_hierarchical) + 
  xlab("White Resident Count") +
  labs(title = "Negative Binomial")+
  theme(plot.title =  element_text(face="bold", size=25, hjust=.5)) 

nyc_predict_clean <- nyc_clean %>%
  na.omit()
set.seed(84735)

predictions_negbin <-  posterior_predict(
  negbin_hierarchical, newdata = nyc_predict_clean)

ppc_intervals(nyc_predict_clean$white_count, yrep = predictions_negbin,
              prob_outer = 0.95) +
  ggplot2::scale_x_continuous(
    labels = nyc_predict_clean$nta_id,
    breaks = 1:nrow(nyc_predict_clean)) +
    xaxis_text(angle = 90,  hjust = 1) +
  theme_linedraw()+
  theme(panel.grid.major = element_line("transparent"),
        axis.title.y = element_blank(),
        axis.text.y = element_blank())+
  coord_flip()

tidy(negbin_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.95)%>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Hierarchichal Negative Binomial - Model Summary") %>% 
  kable_styling()
Hierarchichal Negative Binomial - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 1622.8876825 0.5743522 423.6718923 4886.7340181
mean_income 0.0013529 0.0000063 0.0001195 0.0026384
unemployment_count 0.0429078 0.0001925 0.0052225 0.0818612
transportation_desert_4catLimited Access 115.9874186 0.2364913 36.6932146 240.6738126
transportation_desert_4catSatisfactory Access 99.7306285 0.2826852 14.0744695 247.6489077
store_count 0.6933044 0.0032324 0.0385316 1.3493248
eviction_count -0.2828928 0.0005340 -0.3883515 -0.1760216

Comparison

table3 <- prediction_summary(model=poisson_hierarchical, data=nyc_clean %>% na.omit())%>% 
  mutate(Model = "Hierarchichal Poisson")
table4 <- prediction_summary(model=negbin_hierarchical, data=nyc_clean %>% na.omit())%>% 
  mutate(Model = "Hierarchichal Negative Binomial") 

rbind(table1, table2, table3, table4) %>%
  dplyr::mutate(model = Model, .before=1) %>%
  dplyr::select(-Model)%>% kable() %>% kable_styling()
model mae mae_scaled within_50 within_95
Poisson 3985.174 44.1674809 0.0166667 0.0250000
Negative Binomial 4029.460 0.4133455 0.6000000 0.9833333
Hierarchichal Poisson 3176.197 33.6621538 0.0500000 0.0666667
Hierarchichal Negative Binomial 4368.680 0.4050767 0.6500000 0.9916667

Using in-sample scaled MAE, it’s evident our negative binomial regression models, regardless of hierarchy, performed better than our poisson regression models. However, the differences between the two negative binomial models was neglible. So, we will more closely interpret the non-hierarchichal negative binomial regression model.

tidy(negbin_non_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.95)%>% 
  
  mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100), 
         conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100), 
         conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
  filter(conf.low   > 0 & conf.high > 0 | conf.low  < 0 & conf.high < 0) %>%
  kable(align = "c", caption = "Non-Hierarchichal Negative Binomial - Model Summary") %>% 
  kable_styling()
Non-Hierarchichal Negative Binomial - Model Summary
term estimate std.error conf.low conf.high
(Intercept) 1672.0665352 0.4722233 656.7854013 4238.0843477
mean_income 0.0015311 0.0000065 0.0002738 0.0028136
unemployment_count 0.0445480 0.0001868 0.0076146 0.0830073
transportation_desert_4catLimited Access 116.5599826 0.2303102 36.3798608 239.6150291
transportation_desert_4catSatisfactory Access 122.9881362 0.2645798 32.9433959 276.5013767
transportation_desert_4catExcellent Access 90.3849788 0.3074693 4.3775862 251.4537682
store_count 0.8395041 0.0031393 0.2267540 1.4595117
bus_count 0.6231777 0.0030818 0.0245491 1.2541492
eviction_count -0.3295643 0.0004917 -0.4263220 -0.2294745
(exp(.00001433250)-1)*100
## [1] 0.00143326

After removing predictors whose 95% credible intervals included the possibility of non-effect when controlling for other covariates, we found that there were 6 remaining predictors of an arbitrary neighborhood’s white population count:

  • Mean income by neighborhood
  • Unemployment counts by neighborhood
  • Transportation desert status by neighborhood
  • Food Vendor counts by neighborhood
  • Bus station counts by neighborhood
  • Eviction counts by neighborhood.

Next, we interpret each predictor:

  • Mean Income: When controlling for all other predictors, 1 dollar increases in mean neighborhood rental prices are associated with approximately 0.00143% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.000330%, 0.00268%), indicating that there is almost certainly a negative relationship between these two variables, but its magnitude may vary.
  • Unemployment Count: When controlling for all other predictors, 1 dollar increases in unemployed people counts by neighborhood are associated with approximately 0.0493% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.0140%, 0.0790%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
  • Limited Subway Access: When controlling for all other predictors, a neighborhood with limited access to subway transit is expected to have approximately 125.753% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (51.508%, 269.541%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
  • Satisfactory Subway Access: When controlling for all other predictors, a neighborhood with satisfactory access to subway transit is expected to have approximately 136.342% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (44.081%, 282.669%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
  • Excellent Subway Access: When controlling for all other predictors, a neighborhood with excellent access to subway transit is expected to have approximately 98.186% greater white population counts than a neighborhood with no subway access at all. Further, there is a 95% probability that the relationship may be any value between (15.762%, 275.969%), indicating that there is almost certainly an increase in white population counts between these two neighborhoods, but it’s explicit magnitude may vary.
  • Food Vendor Count: When controlling for all other predictors, 1 store increases in the number of food vendors by neighborhood are associated with approximately 0.759% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.261%, 1.469%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
  • Bus Stop Count: When controlling for all other predictors, 1 stop increases in the number of bus stop by neighborhood are associated with approximately 0.6516973% changes in the white population count. However, there is a 95% chance that the relationship may be any value between (0.0402706%, 1.1496164%), indicating that there is almost certainly a positive relationship between these two variables, but its magnitude may vary.
  • Eviction Count: When controlling for all other predictors, increases in the number of evictions by neighborhood are associated with approximately 0.340% decreases in the white population count. However, there is a 95% chance that the relationship may be any value between (-0.425%, -0.248%), indicating that there is almost certainly a negative relationship between these two variables, but its magnitude may vary.

If we intended to essentialize this list, transportation_desert_4, eviction_count, store_count seem to be the most informative predictors.







Next Steps

We intend to adjust our model specifications and reconsider the construction and inclusion of our predictors. For example, we may want to rethink how we’re making our transit-access variable, whether including car ownership or transit use, and what predictors we’re specifically using in our models.

Moving forward, we want to build some non-hierarchical models predicting subway and bus count using the demographic data (like median income, rent, percentage white population, etc) to see the flip-side of what we are looking at right now with our models above.

Our current hierarchical models attempt to capture a spatial interaction by borough, but we may extend this hierarchy to do census-tracts within neighborhoods, so as to unnderstand observed differences at a finer level. Further, we may include latitude and longitude variables into our non-grouped model to account for spatial relationships and avoid hierarchy altogether. However, we may use CARBayes to adjust for the spatial factors.







Participation

  • Sam Ding: Pulled and prepared subway ridership and access data, then harmonized transit data with existing neighborhood data.
  • Vichy Meas: Helped clean both transit and demographic data, scaffolded the checkpoint 3 responses, and helped plan out transit questions.
  • Freddy Barragan: Pulled and prepared grocery, bus stop, school, eviction, and census data, made exploratory visualizations by neighborhood, built/coded our proposed demographic models.
  • Juthi Dewan: Harmonized, cleaned, and joined all disparate datasets, prepared our data summaries, built/coded our proposed demographic models.